import json

import jieba
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from domain.default_news import news_data, news_after_treat_data


def stop_word_list(path='./data/stopwords.txt'):
    '''
    Load the stop-word list from a text file (one word per line).

    :param path: path to the stop-word file; defaults to the project's
                 bundled list so existing callers are unaffected
    :return: list of stop words with surrounding whitespace stripped
             (blank lines yield empty strings, matching prior behavior)
    '''
    # Use a context manager so the file handle is closed deterministically;
    # the original left it open for the garbage collector to reclaim.
    with open(path, encoding='UTF-8') as f:
        return [line.strip() for line in f]


def seg_depart(sentence):
    '''
    Tokenize input text with jieba and filter out stop words.

    :param sentence: input text (a news article body)
    :return: list of tokens with stop words and whitespace-like
             tokens removed
    '''
    # Whitespace-like tokens jieba may emit that are never wanted:
    # tab, BOM, ASCII space, newline, full-width (ideographic) space.
    skip_tokens = {'\t', '\ufeff', ' ', '\n', '　'}
    # A set gives O(1) membership tests; the original probed a list,
    # making the filter O(len(stop_words)) per token.
    stop_words = set(stop_word_list())
    return [word for word in jieba.cut(sentence.strip())
            if word not in stop_words and word not in skip_tokens]


if __name__ == '__main__':
    # Tokenize every raw news article and persist its token list as JSON
    # into the "after treat" table, preserving type and id.
    engine = create_engine('sqlite:///database/news.db')
    Session = sessionmaker(bind=engine)
    session = Session()
    try:
        news_list = session.query(news_data).all()
        total = len(news_list)  # hoisted: was recomputed every iteration
        for i, news in enumerate(news_list, start=1):
            word_list = seg_depart(news.text)
            # ensure_ascii=False keeps Chinese tokens human-readable in
            # the stored JSON instead of \uXXXX escapes.
            add_new = news_after_treat_data(
                type=news.type,
                json=json.dumps(word_list, ensure_ascii=False),
                id=news.id,
            )
            session.add(add_new)
            # Commit per row so progress survives a mid-run interruption.
            session.commit()
            print("当前进度：" + str(i) + "/" + str(total))
    finally:
        # Close the session even if a query or commit raises; the
        # original leaked it on any exception.
        session.close()